# merge agency level military data with crime and control data
library(dplyr)
library(readr)
library(tidyr)
library(openintro)
library(foreign)
options(scipen=0, digits=7) #to standardize number of digits when exporting data. all data here is run using these options.

controlled_subset <- FALSE

if(controlled_subset){
  load("data/sum_LESO_agency_controlled.RData")
} else {
  load("data/sum_LESO_agency.RData")
}
military <- complete
rm(complete)

load("Data/city-crime-controls.RData")
crimecontrol$Id2 <- as.character(crimecontrol$Id2)

# 13,094 distinct city/state combinations each year
crimecontrol <- crimecontrol %>% 
  group_by(City,State) %>% 
  complete(year = full_seq(2009:2015, 1)) 

military_city <- military %>%
  group_by(city,State,year)%>%
  summarise_at(vars(contains("sum")), sum, na.rm=TRUE)

# join militarization and crime and controls datat
# obs increase because some cities in militarization data not in
# crimecontrol data
df_city <- full_join(crimecontrol,military_city, 
                     by=c("State"="State",
                          "City"="city",
                          "year"="year")) %>% 
  filter(year>2008 & year < 2016)
# Convert NAs to 0s
fix_NA <- grep("sum",names(df_city))
df_city[ ,fix_NA][is.na(df_city[ ,fix_NA])] <- 0

# a lot still don't match but they it is *mostly* because of different time periods in 
# the two datasets
make_log <- colnames(df_city)[grep("sumawards",colnames(df_city),ignore.case = T)]

df_city <- df_city %>%
  group_by(City,State,year)%>%
  mutate_at(make_log,funs(log = log(1+.))) %>% 
  arrange(State,City,year)

#merge that dataset with the IV's needed to conduct the analysis
##merge in the milex dataset
milex <- read_csv("raw-data/usmilex.csv")%>%
  mutate(lag_usmilex = dplyr::lag(usmilex,2)) 
# note this is lagged to t-2
df_city <- left_join(df_city,milex,
                     by=c("year"="year"))%>%
  mutate(ind = ifelse(sumquantity_BGWeapons!=0 | 
                        sumquantity_BGVehicles!=0 |
                        sumquantity_BGGears!=0 |
                        sumquantity_BGOthers!=0,1,0),
         ind = ifelse((year<2006), NA, ind),
         ind = if_else(is.na(ind), 0, ind))  

df_city <- df_city %>% 
  group_by(City,State) %>% 
  summarise(Aid1 = sum(ind))%>%
  left_join(df_city,.) %>%
  arrange(State,City,year)  

##need to create group indices for all agencies that could exist
df_city <- df_city %>% 
  group_by(City,State) %>% 
  mutate(g = group_indices())

#this forms our final dataset for the aggregated city dataset
df_city <- df_city %>%
  group_by(City,State) %>%
  mutate(g = as.factor(as.character(g)),
         milex_iv = Aid1/6*log(lag_usmilex),
         sharemales = popmales/Population, ###finally, create the variables used in B+G : share of males/ages, logged population, medianincome
         share1519 = pop1519/Population,
         share2024 = pop2024/Population,
         share2534 = pop2534/Population,
         logpop=log(Population),
         logmedianhouseholdincome = log(as.numeric(medianhouseholdincome)))%>%
  mutate_at(c('percblack',"PCPI","unemployment","percentinpoverty"),as.numeric) 

##SECOND, merge the instruments used in the Harris et al. piece-------
harris <- read_csv("data/cityHARRISinstruments.csv")%>%
  mutate(State = abbr2state(State),
         City = as.character(City))%>%
  unique()   

harris$City[grep('Flintridge',harris$City)] <- "La Canada Flintridge"
harris$City[grep('Pi<b1>on Hills',harris$City)] <- "Pinon Hills"
harris$City[harris$City=="Ca<b1>on City"] <- "Canon City"
harris$City[harris$City=="Ca<b1>ada de los Alamos"] <- "Canada de los Alamos"
harris$City[harris$City=="Ca<b1>on"] <- "Canon"
harris$City[harris$City=="Ca<b1>ones"] <- "Canones"

#merge all together: harris and the city variables
df_city$City <- gsub(",$",df_city$City,replacement = "")

# keep city that has smallest closest to FAC
harris_dupes <- harris %>%
  group_by(City,State) %>%
  tally() %>%
  #filter(n>1) %>%
  select(-n) %>%
  left_join(., harris[,c("City","State","closestFACs")])

# get min and then use an inner-join to exlclude
harris_dupes <- harris_dupes %>%
  group_by(City, State) %>%
  slice(which.min(closestFACs))

harris <- inner_join(harris,harris_dupes)

agency <- left_join(df_city,harris,
                        by=c("City"="City","State"="State"))

missing <- agency[is.na(agency$closestFACs),] #still missing 9463

# Make sure all military variables are 0 and not NA ------
fix_NA <- grep("sum",colnames(agency))
agency[ ,fix_NA][is.na(agency[ ,fix_NA])] <- 0

#---------------------------------------------------------------------------------------------#
#CREATE INSTRUMENTS WE WILL NEED FOR THE ANALYSIS FOR SUM US ITEMS AS DV. --------
#SumUSitems = the sum of all items given to all jurisdictions within a certain year.
#zHUdI1 = the interaction of the inverse distance to the closest distribution center with sumUSitems
#zHUdI6 = the interaction of the inverse distance to the sixth closest distribution center with sumUSitems
#zHUdIland = interaction between  the  availability  of  tactical  items  and  the  jurisdiction’s  land  area  
#zHUdIhidta = interaction between sumUSitems and  an  indicator  for  having  ever  been  designated  as  a  HIDTA
#---------------------------------------------------------------------------------------------#
SumUSitemsagency <- agency %>% 
  group_by(year)  %>%
  dplyr::summarize(SumUSitems=sum(sumallLESOquantityHARRIS,na.rm=T)) %>%
  as.data.frame()

agency <- left_join(agency,SumUSitemsagency)%>%
  mutate(zHUdI1 = (1/(closestFACs/1609.34))*SumUSitems,
         zHUdI6 = (1/(sixthclosestFACs/1609.34))*SumUSitems,
         zHUdIland = log(landareamiles) * SumUSitems,
         zHUdIhidta = IS_HIDTA * SumUSitems) 

#---------------------------------------------------------------------------------------------#
#CREATE INSTRUMENTS WE WILL NEED FOR THE ANALYSIS FOR SUM US VALUE AS DV. -------
#SumUSvalue = the sum of all items given to all jurisdictions within a certain year.
#zHUd1 = the interaction of the inverse distance to the closest distribution center with sumUSitems
#zHUd6 = the interaction of the inverse distance to the sixth closest distribution center with sumUSitems
#zHUdland = interaction between  the  availability  of  tactical  items  and  the  jurisdiction’s  land  area  
#zHUdhidta = interaction between sumUSitems and  an  indicator  for  having  ever  been  designated  as  a  HIDTA
#plus lags of other important variables
#---------------------------------------------------------------------------------------------#
SumUSvalueagency <- agency %>% 
  group_by(year)  %>%
  dplyr::summarize(SumUSvalue=sum(sumallLESOaidHARRIS,na.rm=T)) %>%
  as.data.frame() 

agency <- left_join(agency,SumUSvalueagency)%>%
  mutate(zHUd1 = (1/(closestFACs/1609.34))*SumUSvalue,
         zHUd6 = (1/(sixthclosestFACs/1609.34))*SumUSvalue,
         zHUdland = log(landareamiles) * SumUSvalue,
         zHUdhitda = IS_HIDTA * SumUSvalue,
         State_abbrv=state2abbr(State))%>% 
  rename(Larceny = `Larceny-theft`,Larceny_rate=`Larceny-theft_rate`) %>%
  group_by(City,State)%>%
  mutate_at(vars(zHUd1,zHUd6,zHUdland,zHUdhitda,SumUSvalue,
                 zHUdI1,zHUdI6,zHUdIland,zHUdIhidta,sumallLESOaidBG,
                 sumallLESOaidHARRIS,sumallLESOquantityHARRIS,
                 Murder_ArrestRate, Rape_ArrestRate, Robbery_ArrestRate,
                 Burglary_ArrestRate,Assault_ArrestRate,SumUSitems,SumUSvalue),
            funs(lag=dplyr::lag(.))) %>% 
  filter(year>2009) %>%
  arrange(State,City,year)   

# generate state*year fixed effect
# and numeric state variable we will need for stata
agency <- agency %>%
  group_by(State) %>%
  mutate(state_numeric = group_indices()) %>%
  group_by(State, year) %>%
  mutate(state_year = group_indices())

#but, there are some city-state duplicates (i.e 2 Warren, OH's)
#likely militarization data is coming from largest city, so we filter by out
#and take the one with the largest number of population
agency <- agency %>% 
 group_by(City,State,year) %>%
top_n(1, Population) 

if(controlled_subset){
  save(agency,file="data/city-aggregate-controlled.RData")
} else {
  save(agency,file="data/city-aggregate.RData")
  write.csv(agency,file="data/city-aggregate.csv")
}

#and, save for stata to be able to calculate the f-stats
agency$g <- as.numeric(agency$g)
agencystata <- agency %>%
  filter(allcrime_rate<1000000)

if(controlled_subset==FALSE){
  write.dta(agencystata, file='data/agency.dta')
}

